R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Load the batting records; the first row of the CSV supplies the column names.
odi <- read.csv("odi-batting.csv", header = TRUE)

library(knitr)
## Warning: package 'knitr' was built under R version 3.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
library(plotly)
## Warning: package 'plotly' was built under R version 3.4.2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(scales)
## Warning: package 'scales' was built under R version 3.4.2
library(treemapify)
## Warning: package 'treemapify' was built under R version 3.4.3
# Parse the match date (month-day-year text) once, then derive the calendar
# fields that the later summaries group by.
data <- as.Date(odi$MatchDate, format = "%m-%d-%Y")
odi <- odi %>%
  mutate(
    Year = format(data, "%Y"),
    Month = format(data, "%m"),
    Weekday = format(data, "%A")
  )

Top five players by total runs

# Total runs per player (missing scores ignored), highest first. The country
# found on each player's first row is carried along for display.
top_players <- odi %>%
  group_by(Player) %>%
  summarise(
    Total_Runs = sum(Runs, na.rm = TRUE),
    Country = first(Country)
  ) %>%
  arrange(desc(Total_Runs))

# Render the first five rows as a formatted table.
kable(top_players[seq_len(5), ])
Player Total_Runs Country
Sachin R Tendulkar 18111 India
Ricky T Ponting 13686 Australia
Sanath T Jayasuriya 13430 Sri Lanka
Inzamam-ul-Haq 11739 Pakistan
Jacques H Kallis 11372 South Africa
## Countrywise total number of players
# Count the distinct players that appear for each country, largest squad first.
country_players <- odi %>%
  group_by(Country) %>%
  summarise(Count_Players = n_distinct(Player)) %>%
  arrange(desc(Count_Players))
head(country_players)
## # A tibble: 6 x 2
##       Country Count_Players
##        <fctr>         <int>
## 1     England           220
## 2       India           185
## 3   Australia           179
## 4    Pakistan           179
## 5 New Zealand           160
## 6 West Indies           159
# Flag every innings by milestone so the flags can be summed per player:
#   ducks     - scored 0
#   centuries - scored 100 or more
#   missed    - finished in the nineties (90-99)
#   fifties   - reached fifty without reaching a hundred (50-99)
# Both lower bounds are inclusive: a score of exactly 50 is a fifty and a score
# of exactly 90 is a missed century. The strict `>` comparisons used before
# silently left those innings out of the counts.
odi <- odi %>%
  mutate(
    ducks = if_else(Runs == 0, 1, 0),
    centuries = if_else(Runs > 99, 1, 0),
    missed = if_else(Runs >= 90 & Runs < 100, 1, 0),
    fifties = if_else(Runs >= 50 & Runs < 100, 1, 0)
  )

# One row per player with career totals of runs and of each milestone flag;
# innings with a missing score are ignored. Sorted by total runs, highest first.
players_summary <- odi %>%
  group_by(Player) %>%
  summarise(
    Total_Runs = sum(Runs, na.rm = TRUE),
    Centuries = sum(centuries, na.rm = TRUE),
    ducks = sum(ducks, na.rm = TRUE),
    fifties = sum(fifties, na.rm = TRUE),
    Missed_Centuries = sum(missed, na.rm = TRUE)
  ) %>%
  arrange(desc(Total_Runs))
kable(head(players_summary))
Player Total_Runs Centuries ducks fifties Missed_Centuries
Sachin R Tendulkar 18111 48 20 93 17
Ricky T Ponting 13686 30 20 80 5
Sanath T Jayasuriya 13430 28 34 65 6
Inzamam-ul-Haq 11739 10 20 80 2
Jacques H Kallis 11372 17 17 81 8
Sourav C Ganguly 11363 22 16 70 3
# Bar chart for total runs by month
# Total runs scored in each calendar month, drawn as bars.
odi %>%
  group_by(Month) %>%
  summarise(runs = sum(Runs, na.rm = TRUE)) %>%
  ggplot(aes(x = Month, y = runs)) +
  geom_col()

# Runs against score rate for each Tendulkar innings, with a smoothed trend.
odi %>%
  filter(Player == "Sachin R Tendulkar") %>%
  ggplot(aes(x = Runs, y = ScoreRate)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

Performance of three players

# The three batsmen whose runs-versus-score-rate trends are compared.
top_player <- c("Sachin R Tendulkar", "Virender Sehwag", "Sourav C Ganguly")

# One smoothed curve per player, distinguished by colour.
odi %>%
  filter(Player %in% top_player) %>%
  ggplot(aes(x = Runs, y = ScoreRate, color = Player)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# Ten highest run scorers (missing scores ignored). This replaces the earlier
# `top_players` table with a two-column Player / Runs version.
top_players <- odi %>%
  group_by(Player) %>%
  summarise(Runs = sum(Runs, na.rm = TRUE)) %>%
  arrange(desc(Runs)) %>%
  head(10)

top_players
## # A tibble: 10 x 2
##                    Player  Runs
##                    <fctr> <int>
##  1     Sachin R Tendulkar 18111
##  2        Ricky T Ponting 13686
##  3    Sanath T Jayasuriya 13430
##  4         Inzamam-ul-Haq 11739
##  5       Jacques H Kallis 11372
##  6       Sourav C Ganguly 11363
##  7           Rahul Dravid 10889
##  8           Brian C Lara 10405
##  9 D P Mahela Jayawardene  9913
## 10        Mohammad Yousuf  9720
# Smoothed runs-versus-score-rate curve for each of the ten leading scorers.
odi %>%
  filter(Player %in% top_players$Player) %>%
  ggplot(aes(x = Runs, y = ScoreRate, color = Player)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).

Sachin's total runs and total centuries, year-wise

# Per-year summary for Tendulkar: total runs, number of centuries and mean runs
# per innings. All three aggregates skip missing scores. Previously only the
# mean did, so a single NA would have turned that year's totals into NA and
# dropped its point from the chart.
sachin <- odi %>%
  filter(Player == "Sachin R Tendulkar") %>%
  group_by(Year) %>%
  summarise(
    Total_run = sum(Runs, na.rm = TRUE),
    Total_cen = sum(centuries, na.rm = TRUE),
    Avg_Runs = mean(Runs, na.rm = TRUE)
  )

# Point size encodes the century count; colour encodes the negated yearly
# average. ggplotly() turns the static chart into an interactive widget.
g <- ggplot(
  sachin,
  aes(x = Year, y = Total_run, size = Total_cen, color = -Avg_Runs)
) +
  geom_point()
ggplotly(g)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

top 10 players top10/total runs

# One-time setup (uncomment if the packages are missing):
# install.packages("devtools")
# install.packages("treemapify")

# Fifty highest-scoring Indian players with their total runs and mean score
# rate; missing values are ignored in both aggregates.
indian_player <- odi %>%
  filter(Country == "India") %>%
  group_by(Player) %>%
  summarise(
    Total_Runs = sum(Runs, na.rm = TRUE),
    Avg_SR = mean(ScoreRate, na.rm = TRUE)
  ) %>%
  arrange(desc(Total_Runs)) %>%
  head(50)

indian_player
## # A tibble: 50 x 3
##                 Player Total_Runs   Avg_SR
##                 <fctr>      <int>    <dbl>
##  1  Sachin R Tendulkar      18111 74.61213
##  2    Sourav C Ganguly      11363 64.90043
##  3        Rahul Dravid      10889 66.91160
##  4 Mohammad Azharuddin       9378 69.66844
##  5        Yuvraj Singh       8051 79.13948
##  6     Virender Sehwag       7760 93.87600
##  7    Mahendra S Dhoni       6497 85.30714
##  8 Alaysinhji D Jadeja       5359 66.71011
##  9      Navjot S Sidhu       4413 57.45315
## 10      Gautam Gambhir       4286 73.77817
## # ... with 40 more rows
# Treemap of the selected Indian players: tile area is total runs, fill is the
# average score rate, and each tile is labelled with the player's name.
g <- ggplot(
  indian_player,
  aes(area = Total_Runs, label = Player, fill = Avg_SR)
) +
  geom_treemap()

g <- g + geom_treemap_text()
plot(g)

# library(scales)
# scale_fill_gradient2() centres its colour ramp on 0 by default, so the
# average score rate is rescaled from its observed min/max onto [-1, 1].
# Scratch check of how rescale() behaves:
#   x <- c(100, 10, 0, 60, 90)
#   rescale(x, c(-1, 1))
#   mean(x)
#
#   min(indian_player$Avg_SR)
#   max(indian_player$Avg_SR)

indian_player$Avg_SR_Scaled <- rescale(indian_player$Avg_SR, c(-1, 1))

# The fill is negated, so the highest score rates sit at the "low" (red) end of
# the diverging palette and the lowest at the "high" (green) end.
g <- ggplot(
  indian_player,
  aes(area = Total_Runs, label = Player, fill = -Avg_SR_Scaled)
) +
  geom_treemap()

g <- g +
  geom_treemap_text() +
  scale_fill_gradient2(low = "red", mid = "yellow", high = "green")
plot(g)

# Same treemap with a custom palette. The plot is rebuilt from its base layers
# rather than by extending `g`, which already carries a text layer and a fill
# scale: adding them a second time drew every label twice and made ggplot2
# report that the existing fill scale was being replaced.
g <- ggplot(
  indian_player,
  aes(area = Total_Runs, label = Player, fill = -Avg_SR_Scaled)
) +
  geom_treemap() +
  geom_treemap_text() +
  scale_fill_gradient2(low = "#F65314", mid = "#FFBB00", high = "#7CBB00")
plot(g)

# Bar per player showing total runs, filled by the negated, rescaled average
# score rate using the same custom diverging palette as the treemap.
ggplot(
  indian_player,
  aes(x = Player, y = Total_Runs, fill = -Avg_SR_Scaled)
) +
  geom_col() +
  scale_fill_gradient2(low = "#F65314", mid = "#FFBB00", high = "#7CBB00")

Create a new column by binning the Runs column using the following ranges and plot the frequency count of ranges

0-25, 26-50, 51-75, 76-100, 100+

# Bin every innings into the run ranges 0-25, 26-50, 51-75, 76-100 and 100+.
# cut() uses right-closed intervals, so 25, 50, 75 and 100 fall in the lower of
# their two neighbouring bins. A missing score stays NA; the previous nested
# if_else()/%in% chain sent it to the "100+" catch-all and inflated that bin.
# The result is converted back to character to keep the column type unchanged.
a <- odi %>%
  mutate(
    run_bins = as.character(cut(
      Runs,
      breaks = c(-Inf, 25, 50, 75, 100, Inf),
      labels = c("0-25", "26-50", "51-75", "76-100", "100+")
    ))
  )

# Preview the bin assigned to the first ten innings.
a %>%
  group_by(run_bins) %>%
  select(run_bins, Runs) %>%
  head(10)
## # A tibble: 10 x 2
## # Groups:   run_bins [5]
##    run_bins  Runs
##       <chr> <int>
##  1     100+   118
##  2     100+   110
##  3   76-100   100
##  4   76-100    82
##  5    51-75    57
##  6    51-75    55
##  7    26-50    37
##  8    26-50    34
##  9     0-25    20
## 10     0-25    16
# Number of innings in each run bin.
b <- a %>%
  group_by(run_bins) %>%
  summarise(frequency = n())

# Bars ordered from the most to the least frequent bin.
ggplot(b, aes(x = reorder(run_bins, -frequency), y = frequency)) +
  geom_col()

  1. Consider the top ten players by total runs and compute and visualize the number of centuries across years (use facet_grid or facet_wrap)
# Parse the match date and pull out the year used for the per-year breakdown.
odi$Date <- as.Date(odi$MatchDate, format = "%m-%d-%Y")
odi$year <- format(odi$Date, "%Y")

# Ten highest run scorers overall (missing scores ignored).
top_run <- odi %>%
  group_by(Player) %>%
  summarise(tot_run = sum(Runs, na.rm = TRUE)) %>%
  arrange(desc(tot_run)) %>%
  head(10)

# Flag innings of 100 or more.
odi <- odi %>%
  mutate(centuries = if_else(Runs > 99, 1, 0))

# Centuries per player per year, restricted to the top ten scorers. Only
# century innings are kept, so years without a century produce no row.
a <- odi %>%
  filter(Player %in% top_run$Player, centuries == 1) %>%
  group_by(Player, year) %>%
  summarise(no_cen = sum(centuries))


# One panel per player with a bar for each year's century count.
ggplot(a, aes(x = year, y = no_cen)) +
  geom_col() +
  facet_wrap(~Player)